hmt <- load_block_group_data(load.cache = T)
15:45:09: Attempting to load cached data.
15:45:09: File at: C:/Users/justin.elszasz/Code/middle_neighborhoods_analysis/data/processed/hmt/hmt_join_acs.rds
cannot open compressed file 'C:/Users/justin.elszasz/Code/middle_neighborhoods_analysis/data/processed/hmt/hmt_join_acs.rds', probable reason 'No such file or directory'15:45:09: Cache does not exist. Creating cache.
15:45:09: Grabbing HMT geospatial table from EGIS server.
15:45:22: Formatting HMT geospatial table.
15:45:23: Loading ACS block group data from Excel sheets.
15:45:23: Joining ACS data to HMT geospatial table.
15:45:23: Directory does not exist. Creating directory.
15:45:23: Cache saved to C:/Users/justin.elszasz/Code/middle_neighborhoods_analysis/data/processed/hmt/hmt_join_acs.rds
15:45:23: HMT data succesfully loaded.
First need to join up the real property data (Open Baltimore) the sales data (provided by Steve, and with deed dates from January 1, 2010 through October 2018) so we have a neighborhood for as many sales as we can.
sales <- sales %>% rename(sales.block = Block, sales.lot = Lot)
real.prop <- real.prop %>% rename(real.block = block, real.lot = lot)
real.prop <- real.prop %>%
mutate(real.block.clean = gsub("^0+", "", real.block),
real.lot.clean = gsub("^0+", "", real.lot))
sales <- sales %>%
mutate(sales.block.clean = gsub("^0+", "", sales.block),
sales.lot.clean = gsub("^0+", "", sales.lot))
sales <- sales %>%
left_join(real.prop,
by = c("sales.block.clean" = "real.block.clean",
"sales.lot.clean" = "real.lot.clean")
)
sales %>% count(is.na(real.block), is.na(real.lot))
1,153 sales didn’t match to a block-lot in the real property table, which means that the block-lot jointly was not in the real prop table.
Also, there are about 16,000 properties in the real prop table that don’t have a neighborhood.
real.prop %>% count(is.na(neighborhood))
So after joining we end up with 9,428 sales that don’t have a neighborhood.
sales %>% count(!is.na(neighborhood))
The real property table also gives if it is principal residence or not, so we’ll also filter for the sales that are for principal residences.
sales %>% count(rescode)
Distribution of city-wide 2018 sales prices:
sales %>%
filter(year(deed.date) == 2018) %>%
ggplot(aes(`Sales Price`)) +
geom_histogram() +
theme_iteam_google_docs() +
xlim(c(0, 500000))

quantile(sales$`Sales Price`, 0.85)
85%
275000
Neighborhood Summary Table, 2015-2017
meet.criteria <- sales %>%
filter(year(deed.date) %in% c(2015, 2016, 2017),
!is.na(neighborhood),
`How Conveyed` == 1,
!grepl("NOT", rescode)) %>%
nrow
We have 16112 samples to work with that are in 2015-2017, have a neighborhood, were an arms-length sale, and are the principal residence.
sales.summary.15_17.by.hood <- sales %>%
filter(year(deed.date) %in% c(2015, 2016, 2017),
!is.na(neighborhood),
`How Conveyed` == 1,
!grepl("NOT", rescode)) %>%
group_by(neighborhood) %>%
summarise(hood.n = n(),
hood.mean = mean(`Sales Price`),
hood.median = median(`Sales Price`),
hood.std = sqrt(sum((`Sales Price`-hood.mean)^2/(hood.n-1))),
hood.95th = quantile(`Sales Price`, probs = .95),
hood.98th = quantile(`Sales Price`, probs = .98),
hood.99th = quantile(`Sales Price`, probs = .99))
sales.summary.15_17.by.hood
Which neighborhoods have less than 20 sales meeting the criteria?
sales.summary.15_17.by.hood %>%
filter(hood.n < 20)
84 neighborhoods have less than 20 sales meeting the criteria. We’ll exclude them going forward so we have a reasonable sample size.
# Join the summaries to the neighborhood boundaries
hoods@data <- hoods@data %>%
left_join(sales.summary.15_17.by.hood,
by = c("label" = "neighborhood"))
98th Percentile
Criteria & Results
sales.hood.98th <- sales %>%
left_join(sales.summary.15_17.by.hood,
by = c("neighborhood" = "neighborhood")) %>%
filter(year(deed.date) == 2018,
hood.n >= 20,
`Sales Price` >= hood.98th,
`How Conveyed` == 1,
!grepl("NOT", rescode)) %>%
arrange(neighborhood)
result.sales <- nrow(sales.hood.98th)
There are 167 sales that meet the following criteria:
- Deed date was between January 1, 2018 and October 5, 2018
- Arms-length sale
- Principal residence
- Neighborhood had at least 20 sales
- 98th percentile for sales prices for their neighborhood.
(If this yield isn’t high enough we can bump it down to the 95th percentile.)
sales.hood.98th$long <- lapply(sales.hood.98th$location.coordinates, function(x) x[1]) %>% unlist()
sales.hood.98th$lat <- lapply(sales.hood.98th$location.coordinates, function(x) x[2]) %>% unlist()
sales.hood.98th.geo <- sales.hood.98th %>% filter(!is.na(long))
sales.hood.98th.geo <- SpatialPointsDataFrame(
sales.hood.98th.geo %>% select(long, lat),
sales.hood.98th.geo,
proj4string = CRS("+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"))
sales.hood.98th.geo <-
spTransform(
sales.hood.98th.geo,
CRSobj = CRS("+init=epsg:4326 +proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0")
)
Map
library(htmltools)
hoods.labels <- paste0(
hoods$label,
"<br>Median Sales, 2015-2017: ", as.character(hoods$hood.median)
)
sale.labels <- paste0(
sales.hood.98th.geo$`House #`, " ",
sales.hood.98th.geo$`Street Name`, " ",
sales.hood.98th.geo$Suffix,
"<br>Sale Price in 2018: ",
as.character(sales.hood.98th.geo$`Sales Price`),
"<br>New Owner: ", sales.hood.98th.geo$new.owner
)
leaflet() %>%
setView(lng = -76.6, lat = 39.3, zoom = 11) %>%
addProviderTiles(providers$Stamen.TonerLite) %>%
addPolygons(data = hoods,
weight = 2,
color = "black",
opacity = 0.5,
fillOpacity = 0,
label = ~lapply(hoods.labels, HTML)) %>%
addCircleMarkers(data = sales.hood.98th.geo,
radius = 2,
label = ~lapply(sale.labels, HTML))
Full list
sales.hood.98th
Detect jumps
#look for temporal jumps in prices
99th Percentile
Criteria & Results
sales.hood.99th <- sales %>%
left_join(sales.summary.15_17.by.hood,
by = c("neighborhood" = "neighborhood")) %>%
filter(year(deed.date) == 2018,
hood.n >= 20,
`Sales Price` >= hood.99th,
`How Conveyed` == 1,
!grepl("NOT", rescode)) %>%
arrange(neighborhood)
result.sales <- nrow(sales.hood.99th)
There are 167 sales that meet the following criteria:
- Deed date was between January 1, 2018 and October 5, 2018
- Arms-length sale
- Principal residence
- Neighborhood had at least 20 sales
- 99th percentile for sales prices for their neighborhood.
(If this yield isn’t high enough we can bump it down to the 95th percentile.)
sales.hood.99th$long <- lapply(sales.hood.99th$location.coordinates, function(x) x[1]) %>% unlist()
sales.hood.99th$lat <- lapply(sales.hood.99th$location.coordinates, function(x) x[2]) %>% unlist()
sales.hood.99th.geo <- sales.hood.99th %>% filter(!is.na(long))
sales.hood.99th.geo <- SpatialPointsDataFrame(
sales.hood.99th.geo %>% select(long, lat),
sales.hood.99th.geo,
proj4string = CRS("+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"))
sales.hood.99th.geo <-
spTransform(
sales.hood.99th.geo,
CRSobj = CRS("+init=epsg:4326 +proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0")
)
Map
library(htmltools)
hoods.labels <- paste0(
hoods$label,
"<br>Median Sales, 2015-2017: ", as.character(hoods$hood.median)
)
sale.labels <- paste0(
sales.hood.99th.geo$`House #`, " ",
sales.hood.99th.geo$`Street Name`, " ",
sales.hood.99th.geo$Suffix,
"<br>Sale Price in 2018: ",
as.character(sales.hood.99th.geo$`Sales Price`),
"<br>New Owner: ", sales.hood.99th.geo$new.owner
)
leaflet() %>%
setView(lng = -76.6, lat = 39.3, zoom = 11) %>%
addProviderTiles(providers$Stamen.TonerLite) %>%
addPolygons(data = hoods,
weight = 2,
color = "black",
opacity = 0.5,
fillOpacity = 0,
label = ~lapply(hoods.labels, HTML)) %>%
addCircleMarkers(data = sales.hood.99th.geo,
radius = 2,
label = ~lapply(sale.labels, HTML))
In Middle Neighborhood, 99th Percentile for Neighborhood, Over $250k
The following criteria are used below:
- Deed date was between January 1, 2018 and October 5, 2018
- Arms-length sale
- Principal residence
- Neighborhood had at least 20 sales
- 99th percentile for sales prices for their neighborhood.
- Sale price over $250,000
mid.hoods <- hmt.hood %>% filter(`Predominant Code Ignoring Non-Residential` %in% c("D", "E", "F", "G", "H"))
sales.99th.mid.hood <- subset(sales.hood.99th.geo, tolower(neighborhood) %in% tolower(mid.hoods$Neighborhood))
mid.hoods.geo <- subset(hoods,
tolower(label) %in% tolower(mid.hoods$Neighborhood))
sales.99th.mid.hood.over.250k <- subset(sales.99th.mid.hood,
`Sales Price` > 250000)
mid.hoods.labels <- paste0(
mid.hoods.geo$label,
"<br>Median Sales, 2015-2017: ", as.character(mid.hoods.geo$hood.median)
)
sale.labels <- paste0(
sales.99th.mid.hood.over.250k$`House #`, " ",
sales.99th.mid.hood.over.250k$`Street Name`, " ",
sales.99th.mid.hood.over.250k$Suffix,
"<br>Sale Price in 2018: ",
as.character(sales.99th.mid.hood.over.250k$`Sales Price`),
"<br>New Owner: ", sales.99th.mid.hood.over.250k$new.owner
)
leaflet() %>%
setView(lng = -76.6, lat = 39.3, zoom = 11) %>%
addProviderTiles(providers$Stamen.TonerLite) %>%
addPolygons(data = hoods,
weight = 2,
color = "black",
opacity = 0.5,
fillOpacity = 0,
label = ~lapply(hoods.labels, HTML)) %>%
addPolygons(data = mid.hoods.geo,
weight = 2,
#color = "black",
opacity = 0.0,
fillOpacity = .2,
fillColor = iteam.colors[3],
label = ~lapply(mid.hoods.labels, HTML)) %>%
addCircleMarkers(data = sales.99th.mid.hood.over.250k,
color = iteam.colors[1],
opacity = 1,
radius = 2,
label = ~lapply(sale.labels, HTML))
Full List
sales.99th.mid.hood.over.250k
---
title: "Recent Sales Outliers"
author: "Justin Elszasz, Mayor's Office of Innovation"
email: "justin.elszasz@baltimorecity.gov"
date: "Thursday, February 28, 2019"
output:
  html_notebook:
    code_folding: hide
    fig_height: 5
    fig_width: 10
    toc: yes
    toc_depth: 2
---

```{r setup, include = FALSE, echo = FALSE, message = FALSE, cache = TRUE}
knitr::opts_chunk$set(echo = FALSE, warning = F, message = F, include = T,
                                 fig.width = 10, fig.height = 5)
```


```{r}
source("../src/00_initialize.R")
sales <- load_sales_data(load.cache = T)

library(RSocrata)
library(sp)
library(leaflet)

real.prop.url <- "https://data.baltimorecity.gov/resource/6act-qzuy.json"
real.prop <- read.socrata(real.prop.url, app_token = VARS$SOCRATA_TOKEN)

hoods <- get_neighborhood_boundaries()
hmt <- load_block_group_data(load.cache = T)
hmt.hood <- read_excel("../data/raw/hmt/HMT by Neighborhood 2017.xlsx")
```

First need to join up the real property data ([Open Baltimore]([http://data.baltimorecity.gov/Financial/Real-Property-Taxes/27w9-urtvto)) the sales data (provided by Steve, and with **deed dates from January 1, 2010 through October 2018**) so we have a neighborhood for as many sales as we can.

```{r}
sales <- sales %>% rename(sales.block = Block, sales.lot = Lot)
real.prop <- real.prop %>% rename(real.block = block, real.lot = lot)
```

```{r}
real.prop <- real.prop %>%
  mutate(real.block.clean = gsub("^0+", "", real.block),
         real.lot.clean = gsub("^0+", "", real.lot))

sales <- sales %>%
  mutate(sales.block.clean = gsub("^0+", "", sales.block),
         sales.lot.clean = gsub("^0+", "", sales.lot))
```

```{r}
sales <- sales %>%
  left_join(real.prop, 
            by = c("sales.block.clean" = "real.block.clean",
                   "sales.lot.clean" = "real.lot.clean")
            )
```

```{r}
sales %>% count(is.na(real.block), is.na(real.lot))
```

1,153 sales didn't match to a block-lot in the real property table, which means that the block-lot jointly was not in the real prop table. 

Also, there are about 16,000 properties in the real prop table that don't have a neighborhood. 

```{r}
real.prop %>% count(is.na(neighborhood))
```

So after joining we end up with 9,428 sales that don't have a neighborhood.

```{r}
sales %>% count(!is.na(neighborhood))
```

The real property table also gives if it is principal residence or not, so we'll also filter for the sales that are for principal residences.

```{r}
sales %>% count(rescode)
```

Distribution of city-wide 2018 sales prices:

```{r}
sales %>%
  filter(year(deed.date) == 2018) %>%
  ggplot(aes(`Sales Price`)) +
  geom_histogram() +
  theme_iteam_google_docs() +
  xlim(c(0, 500000))
```

```{r}
quantile(sales$`Sales Price`, 0.85)
```
 


# Neighborhood Summary Table, 2015-2017

```{r}
meet.criteria <- sales %>%
  filter(year(deed.date) %in% c(2015, 2016, 2017),
         !is.na(neighborhood),
         `How Conveyed` == 1,
         !grepl("NOT", rescode)) %>%
  nrow
```

We have `r meet.criteria` samples to work with that are in 2015-2017, have a neighborhood, were an arms-length sale, and are the principal residence.

```{r}
sales.summary.15_17.by.hood <- sales %>%
  filter(year(deed.date) %in% c(2015, 2016, 2017),
         !is.na(neighborhood),
         `How Conveyed` == 1,
         !grepl("NOT", rescode)) %>%
  group_by(neighborhood) %>%
  summarise(hood.n = n(),
            hood.mean = mean(`Sales Price`),
            hood.median = median(`Sales Price`),
            hood.std = sqrt(sum((`Sales Price`-hood.mean)^2/(hood.n-1))),
            hood.95th = quantile(`Sales Price`, probs = .95),
            hood.98th = quantile(`Sales Price`, probs = .98),
            hood.99th = quantile(`Sales Price`, probs = .99))

sales.summary.15_17.by.hood  
```

Which neighborhoods have less than 20 sales meeting the criteria?

```{r}
sales.summary.15_17.by.hood %>%
  filter(hood.n < 20)
```

84 neighborhoods have less than 20 sales meeting the criteria. We'll exclude them going forward so we have a reasonable sample size.

```{r}
# Join the summaries to the neighborhood boundaries
hoods@data <- hoods@data %>% 
  left_join(sales.summary.15_17.by.hood,
            by = c("label" = "neighborhood"))
```

# 98th Percentile

## Criteria & Results

```{r}
sales.hood.98th <- sales %>%
  left_join(sales.summary.15_17.by.hood,
            by = c("neighborhood" = "neighborhood")) %>%
  filter(year(deed.date) == 2018,
         hood.n >= 20,
         `Sales Price` >= hood.98th,
         `How Conveyed` == 1,
         !grepl("NOT", rescode)) %>%
  arrange(neighborhood)

result.sales <- nrow(sales.hood.98th)
```

**There are `r result.sales` sales that meet the following criteria:**

- Deed date was between January 1, 2018 and October 5, 2018
- Arms-length sale
- Principal residence
- Neighborhood had at least 20 sales
- 98th percentile for sales prices for their neighborhood.

(If this yield isn't high enough we can bump it down to the 95th percentile.)


```{r}
sales.hood.98th$long <- lapply(sales.hood.98th$location.coordinates, function(x) x[1]) %>% unlist()

sales.hood.98th$lat <- lapply(sales.hood.98th$location.coordinates, function(x) x[2]) %>% unlist()
```


```{r}
sales.hood.98th.geo <- sales.hood.98th %>% filter(!is.na(long))
  
sales.hood.98th.geo <- SpatialPointsDataFrame(
  sales.hood.98th.geo %>% select(long, lat), 
  sales.hood.98th.geo,
  proj4string = CRS("+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"))

sales.hood.98th.geo <- 
  spTransform(
    sales.hood.98th.geo, 
    CRSobj = CRS("+init=epsg:4326 +proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0")
    )

```

## Map

```{r}
library(htmltools)

hoods.labels <- paste0(
  hoods$label,
  "<br>Median Sales, 2015-2017: ", as.character(hoods$hood.median)
  
)

sale.labels <- paste0(
  sales.hood.98th.geo$`House #`, " ",
  sales.hood.98th.geo$`Street Name`, " ",
  sales.hood.98th.geo$Suffix, 
  "<br>Sale Price in 2018: ", 
  as.character(sales.hood.98th.geo$`Sales Price`),
  "<br>New Owner: ", sales.hood.98th.geo$new.owner
)


leaflet() %>%
  setView(lng = -76.6, lat = 39.3, zoom = 11) %>%
  addProviderTiles(providers$Stamen.TonerLite) %>% 
  addPolygons(data = hoods, 
              weight = 2, 
              color = "black",
              opacity = 0.5,
              fillOpacity = 0, 
              label = ~lapply(hoods.labels, HTML)) %>%
  addCircleMarkers(data = sales.hood.98th.geo, 
                   radius = 2,
                   label = ~lapply(sale.labels, HTML))
```

## Full list

```{r}
sales.hood.98th 
```

# Detect jumps

```{r}
#look for temporal jumps in prices
```

# 99th Percentile

## Criteria & Results

```{r}
sales.hood.99th <- sales %>%
  left_join(sales.summary.15_17.by.hood,
            by = c("neighborhood" = "neighborhood")) %>%
  filter(year(deed.date) == 2018,
         hood.n >= 20,
         `Sales Price` >= hood.99th,
         `How Conveyed` == 1,
         !grepl("NOT", rescode)) %>%
  arrange(neighborhood)

result.sales <- nrow(sales.hood.99th)
```

**There are `r result.sales` sales that meet the following criteria:**

- Deed date was between January 1, 2018 and October 5, 2018
- Arms-length sale
- Principal residence
- Neighborhood had at least 20 sales
- 99th percentile for sales prices for their neighborhood.

(If this yield isn't high enough we can bump it down to the 95th percentile.)


```{r}
sales.hood.99th$long <- lapply(sales.hood.99th$location.coordinates, function(x) x[1]) %>% unlist()

sales.hood.99th$lat <- lapply(sales.hood.99th$location.coordinates, function(x) x[2]) %>% unlist()
```


```{r}
sales.hood.99th.geo <- sales.hood.99th %>% filter(!is.na(long))
  
sales.hood.99th.geo <- SpatialPointsDataFrame(
  sales.hood.99th.geo %>% select(long, lat), 
  sales.hood.99th.geo,
  proj4string = CRS("+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"))

sales.hood.99th.geo <- 
  spTransform(
    sales.hood.99th.geo, 
    CRSobj = CRS("+init=epsg:4326 +proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0")
    )

```

## Map

```{r}
library(htmltools)

hoods.labels <- paste0(
  hoods$label,
  "<br>Median Sales, 2015-2017: ", as.character(hoods$hood.median)
  
)

sale.labels <- paste0(
  sales.hood.99th.geo$`House #`, " ",
  sales.hood.99th.geo$`Street Name`, " ",
  sales.hood.99th.geo$Suffix, 
  "<br>Sale Price in 2018: ", 
  as.character(sales.hood.99th.geo$`Sales Price`),
  "<br>New Owner: ", sales.hood.99th.geo$new.owner
)


leaflet() %>%
  setView(lng = -76.6, lat = 39.3, zoom = 11) %>%
  addProviderTiles(providers$Stamen.TonerLite) %>% 
  addPolygons(data = hoods, 
              weight = 2, 
              color = "black",
              opacity = 0.5,
              fillOpacity = 0, 
              label = ~lapply(hoods.labels, HTML)) %>%
  addCircleMarkers(data = sales.hood.99th.geo, 
                   radius = 2,
                   label = ~lapply(sale.labels, HTML))
```

```{r}
emily.list <- c("4011 BARRINGTON",
                "DORCHESTER",
                "2319 MONTICELLO")

sales.hood.98th.geo@data %>% 
  filter(grepl(paste(emily.list, collapse="|"), propertyaddress))
```

# In Middle Neighborhood, 99th Percentile for Neighborhood, Over $250k

The following criteria are used below:

- Deed date was between January 1, 2018 and October 5, 2018
- Arms-length sale
- Principal residence
- Neighborhood had at least 20 sales
- 99th percentile for sales prices for their neighborhood.
- Sale price over $250,000


```{r}
mid.hoods <- hmt.hood %>% filter(`Predominant Code Ignoring Non-Residential` %in% c("D", "E", "F", "G", "H"))

sales.99th.mid.hood <- subset(sales.hood.99th.geo, tolower(neighborhood) %in% tolower(mid.hoods$Neighborhood))
```


```{r}

mid.hoods.geo <- subset(hoods, 
                        tolower(label) %in% tolower(mid.hoods$Neighborhood))

sales.99th.mid.hood.over.250k <- subset(sales.99th.mid.hood,
                                        `Sales Price` > 250000)

mid.hoods.labels <- paste0(
  mid.hoods.geo$label,
  "<br>Median Sales, 2015-2017: ", as.character(mid.hoods.geo$hood.median)
  
)

sale.labels <- paste0(
  sales.99th.mid.hood.over.250k$`House #`, " ",
  sales.99th.mid.hood.over.250k$`Street Name`, " ",
  sales.99th.mid.hood.over.250k$Suffix, 
  "<br>Sale Price in 2018: ", 
  as.character(sales.99th.mid.hood.over.250k$`Sales Price`),
  "<br>New Owner: ", sales.99th.mid.hood.over.250k$new.owner
)


leaflet() %>%
  setView(lng = -76.6, lat = 39.3, zoom = 11) %>%
  addProviderTiles(providers$Stamen.TonerLite) %>% 
  addPolygons(data = hoods, 
              weight = 2, 
              color = "black",
              opacity = 0.5,
              fillOpacity = 0, 
              label = ~lapply(hoods.labels, HTML)) %>%
  addPolygons(data = mid.hoods.geo, 
              weight = 2, 
              #color = "black",
              opacity = 0.0,
              fillOpacity = .2,
              fillColor = iteam.colors[3],
              label = ~lapply(mid.hoods.labels, HTML)) %>%
  addCircleMarkers(data = sales.99th.mid.hood.over.250k, 
                   color = iteam.colors[1],
                   opacity = 1,
                   radius = 2,
                   label = ~lapply(sale.labels, HTML))
```

## Full List

```{r}
sales.99th.mid.hood.over.250k
```

